/hps/research1/birney/users/ian/rac_hyplibrary(here)
library(pegas)
library(tidyverse)
library(reshape2)
library(readxl)
library(plotly)
library(ggridges)
cd /nfs/software/birney
wget https://github.com/broadinstitute/gatk/releases/download/4.1.4.1/gatk-4.1.4.1.zip
unzip gatk-4.1.4.1.zip
# amend aliases in ~/.bashrc and ~/.bash_profile
export PATH=$PATH:/nfs/software/birney/gatk-4.1.4.1/
wget -r -p -k --no-parent -cut-dirs=5 ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/
find vcfs/ftp.1000genomes.ebi.ac.uk/ALL.chr*.vcf.gz > racist_hypothesis/data/20200205_vcfs.list
java -jar /nfs/software/birney/picard-2.9.0/picard.jar MergeVcfs \
I=racist_hypothesis/data/20200205_vcfs.list \
O=vcfs/1gk_all.vcf.gz
# Exception in thread "main" java.lang.IllegalArgumentException: The contig entries in input file /hps/research1/birney/users/ian/rac_hyp/vcfs/ftp.1000genomes.ebi.ac.uk/ALL.chrMT.phase3_callmom-v0_4.20130502.genotypes.vcf.gz are not compatible with the others.
# So remove that one from list above
sed -i '/MT/d' racist_hypothesis/data/20200205_vcfs.list
# run MergeVCFs again
java -jar /nfs/software/birney/picard-2.9.0/picard.jar MergeVcfs \
I=racist_hypothesis/data/20200205_vcfs.list \
O=vcfs/1gk_all.vcf.gz
# Exception in thread "main" java.lang.IllegalArgumentException: The contig entries in input file /hps/research1/birney/users/ian/rac_hyp/vcfs/ftp.1000genomes.ebi.ac.uk/ALL.chrY.phase3_integrated_v2a.20130502.genotypes.vcf.gz are not compatible with the others.
sed -i '/chrY/d' racist_hypothesis/data/20200205_vcfs.list
# run MergeVCFs again
java -jar /nfs/software/birney/picard-2.9.0/picard.jar MergeVcfs \
I=racist_hypothesis/data/20200205_vcfs.list \
O=vcfs/1gk_all.vcf.gz
# SUCCESS
From Lee et al. (2019) Gene discovery and polygenic prediction from a genome-wide association study of educational attainment in 1.1 million individuals, Nature: https://www.nature.com/articles/s41588-018-0147-3.
Data download link: https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-018-0147-3/MediaObjects/41588_2018_147_MOESM3_ESM.xlsx
Saved here: data/20180723_Lee-et-al_supp-tables.xlsx
From Yengo et al. (2018) Meta-analysis of genome-wide association studies for height and body mass index in approximately 700000 individuals of European ancestry, Human Molecular Genetics: https://academic.oup.com/hmg/article-abstract/27/20/3641/5067845.
Data downloaded from this webpage: https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files Data download link: https://portals.broadinstitute.org/collaboration/giant/images/e/e2/Meta-analysis_Locke_et_al%2BUKBiobank_2018_top_941_from_COJO_analysis_UPDATED.txt.gz
cd racist_hypothesis/data
# download
wget https://portals.broadinstitute.org/collaboration/giant/images/e/e2/Meta-analysis_Locke_et_al%2BUKBiobank_2018_top_941_from_COJO_analysis_UPDATED.txt.gz
# unzip
gunzip Meta-analysis_Locke_et_al%2BUKBiobank_2018_top_941_from_COJO_analysis_UPDATED.txt.gz
# rename
mv Meta-analysis_Wood_et_al+UKBiobank_2018_top_3290_from_COJO_analysis.txt 20181015_Yengo-et-al_snps_height.txt
Saved here: data/20181015_Yengo-et-al_snps_height.txt
data/20171117_Crawford-et-al_Table-1.xlsxdata/20190121_Adhikari-et-al_snps.xlsxdata/20170316_Hernandez-Pacheco-et-al.xlsx.Others compiled into the single XLSX doc data/20200622_pigmentation_snps.xlsx:
pig_snps <- list()
# Crawford
pig_snps[["crawford"]] <- readxl::read_excel(here("data", "20171117_Crawford-et-al_Table-1.xlsx")) %>%
dplyr::select(rsid = "RSID", everything()) %>%
dplyr::filter(!is.na(rsid),
!rsid == ".")
# Adhikari
pig_snps[["adhikari_tbl_1"]] <- readxl::read_excel(here("data", "20190121_Adhikari-et-al_snps.xlsx"),
sheet = "Table 1",
skip = 1) %>%
dplyr::select(rsid = "rsID", everything()) %>%
dplyr::filter(!is.na(rsid))
pig_snps[["adhikari_supp_6"]] <- readxl::read_excel(here("data", "20190121_Adhikari-et-al_snps.xlsx"),
sheet = "supp_table_6") %>%
dplyr::select(rsid = "SNP", everything()) %>%
dplyr::filter(!is.na(rsid))
pig_snps[["adhikari_supp_12"]] <- readxl::read_excel(here("data", "20190121_Adhikari-et-al_snps.xlsx"),
sheet = "supp_table_12") %>%
dplyr::select(rsid = "SNP", everything()) %>%
dplyr::filter(!is.na(rsid))
# Hernandez-Pacheco
pig_snps[["hernandez-pacheco"]] <- readxl::read_excel(here("data", "20170316_Hernandez-Pacheco-et-al.xlsx"))
# Doc with SNPs from multiple studies
sheet_names <- readxl::excel_sheets(here("data", "20200622_pigmentation_snps.xlsx"))
compiled_snps <- lapply(sheet_names, function(x){
x <- readxl::read_excel(here("data", "20200622_pigmentation_snps.xlsx"),
sheet = x)
})
names(compiled_snps) <- sheet_names
# Combine
pig_snps <- c(pig_snps, compiled_snps)
pig_df <- lapply(pig_snps, function(x) dplyr::select(x, rsid))
pig_df <- dplyr::bind_rows(pig_df)
pig_df <- unique(pig_df)
nrow(pig_df)
## [1] 266
head(pig_df)
## # A tibble: 6 x 1
## rsid
## <chr>
## 1 rs2413887
## 2 rs1426654
## 3 rs1834640
## 4 rs2675345
## 5 rs8028919
## 6 rs10424065
# extract from excel doc
snps_eduyrs <- read_xlsx(here::here("data", "20180723_Lee-et-al_supp-tables.xlsx"), sheet = "2. EduYears Lead SNPs", skip = 1, n_max = 1271)
# write table of SNPs
write.table(snps_eduyrs[["SNP"]], here::here("data", "20200316_snps_eduyears.list"), quote = F, row.names = F, col.names = F)
gatk SelectVariants \
-R refs/hs37d5.fa.gz \
-V vcfs/1gk_all.vcf.gz \
--keep-ids racist_hypothesis/data/20200316_snps_eduyears.list \
-O vcfs/snphits_eduyrs.vcf.gz
cut -f1 racist_hypothesis/data/20181015_Yengo-et-al_snps_height.txt | tail -n+2 > racist_hypothesis/data/20200318_snps_height.list
gatk SelectVariants \
-R refs/hs37d5.fa.gz \
-V vcfs/1gk_all.vcf.gz \
--keep-ids racist_hypothesis/data/20200318_snps_height.list \
-O vcfs/snphits_height.vcf.gz
write.table(x = pig_df$rsid,
file = here::here("data", "20200622_snps_pig.list"),
quote = F,
row.names = F,
col.names = F)
gatk SelectVariants \
-R refs/hs37d5.fa.gz \
-V vcfs/1gk_all.vcf.gz \
--keep-ids racist_hypothesis/data/20200622_snps_pig.list \
-O vcfs/snphits_pig.vcf.gz
Plink2Downloded via this page: http://www.internationalgenome.org/data Download link: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_sample_info.xlsx.
meta <- read_xlsx(here::here("data", "20130606_sample_info.xlsx"), sheet = "Sample Info") %>% dplyr::select(Sample, Population, Gender)
meta
## # A tibble: 3,500 x 3
## Sample Population Gender
## <chr> <chr> <chr>
## 1 HG00096 GBR male
## 2 HG00097 GBR female
## 3 HG00098 GBR male
## 4 HG00099 GBR female
## 5 HG00100 GBR female
## 6 HG00101 GBR male
## 7 HG00102 GBR female
## 8 HG00103 GBR male
## 9 HG00104 GBR female
## 10 HG00105 GBR male
## # … with 3,490 more rows
write.table(meta[, 1:2],
here::here("data", "plink2_sample_popn_key.txt"),
quote = F,
sep = "\t",
row.names = F,
col.names = F)
mkdir racist_hypothesis/data/20200622_plink2_alfreqs
mkdir racist_hypothesis/data/20200622_plink2_alfreqs/hei
mkdir racist_hypothesis/data/20200622_plink2_alfreqs/edu
mkdir racist_hypothesis/data/20200622_plink2_alfreqs/pig
# Edu Years
/nfs/software/birney/plink2.3/plink2 \
--vcf vcfs/snphits_eduyrs.vcf.gz \
--freq \
--max-alleles 2 \
--pheno iid-only racist_hypothesis/data/plink2_sample_popn_key.txt \
--loop-cats PHENO1 \
--out racist_hypothesis/data/20200622_plink2_alfreqs/edu/edu
# Height
/nfs/software/birney/plink2.3/plink2 \
--vcf vcfs/snphits_height.vcf.gz \
--freq \
--max-alleles 2 \
--pheno iid-only racist_hypothesis/data/plink2_sample_popn_key.txt \
--loop-cats PHENO1 \
--out racist_hypothesis/data/20200622_plink2_alfreqs/hei/hei
# Pigmentation
/nfs/software/birney/plink2.3/plink2 \
--vcf vcfs/snphits_pig.vcf.gz \
--freq \
--max-alleles 2 \
--pheno iid-only racist_hypothesis/data/plink2_sample_popn_key.txt \
--loop-cats PHENO1 \
--out racist_hypothesis/data/20200622_plink2_alfreqs/pig/pig
target_dirs <- list.dirs(here::here("data", "20200622_plink2_alfreqs"), recursive = F)
al_freq_lst <- lapply(target_dirs, function(x){
target_files <- list.files(x, pattern = ".afreq", full.names = T)
# read in data
data_lst <- lapply(target_files, function(target_file){
read.table(target_file,
header = T,
comment.char = "")
})
# fix names of populations
names(data_lst) <- gsub(pattern = "edu.|hei.|pig.|.afreq",
replacement = "",
x = list.files(x, pattern = ".afreq"))
return(data_lst)
})
# set names
names(al_freq_lst) <- basename(target_dirs)
al_freq_df <- lapply(al_freq_lst, function(pheno){
out <- dplyr::bind_rows(pheno, .id = "population") %>%
tidyr::pivot_wider(id_cols = c(X.CHROM, ID, REF, ALT),
names_from = population,
values_from = ALT_FREQS)
})
set.seed(65)
rdm_sds <- sample(1:100, 3)
counter <- 0
al_freq_df_shuff <- lapply(al_freq_df, function(pheno){
counter <<- counter + 1
# set seed
set.seed(rdm_sds[counter])
# select SNPs to swap (half of total)
tgt_indcs <- sample(nrow(pheno), nrow(pheno) /2)
# swap minor alleles
pheno[tgt_indcs, 5:ncol(pheno)] <- 1 - pheno[tgt_indcs, 5:ncol(pheno)]
# return pheno
return(pheno)
})
# Set up titles vector
titles <- c("Educational Attainment", "Height", "Pigmentation")
counter <- 0
lapply(al_freq_df_shuff, function(pheno){
counter <<- counter + 1
ggplot(pheno,
aes(YRI, CHS)) +
geom_point(size = 0.5) +
coord_fixed() +
geom_smooth(se = F, colour = "red") +
geom_abline(intercept = 0, slope = 1, colour = "blue") +
xlab("Allele frequency in YRI") +
ylab("Allele frequency in CHS") +
labs(title = titles[counter])
})
## $edu
##
## $hei
##
## $pig
counter <- 0
lapply(al_freq_df_shuff, function(pheno){
counter <<- counter + 1
ggplot(pheno,
aes(YRI, CEU)) +
geom_point(size = 0.5) +
coord_fixed() +
geom_smooth(se = F, colour = "red") +
geom_abline(intercept = 0, slope = 1, colour = "blue") +
xlab("Allele frequency in YRI") +
ylab("Allele frequency in CEU") +
labs(title = titles[counter])
})
## $edu
##
## $hei
##
## $pig
colourscales <- c("Viridis", "Hot", "Electric")
titles <- c("Educational Attainment", "Height", "Skin/hair pigmentation")
counter <- 0
plts <- lapply(al_freq_df_shuff, function(pheno){
counter <<- counter + 1
# set graph resolution
graph_reso <- 0.05
# get lm for data
loess_model <- loess(CEU ~ 0 + CHS + YRI, data = pheno)
# set up axes
axis_x <- seq(min(pheno$CHS), max(pheno$CHS), by = graph_reso)
axis_y <- seq(min(pheno$YRI), max(pheno$YRI), by = graph_reso)
# sample points
lm_surface <- expand.grid(CHS = axis_x,
YRI = axis_y,
KEEP.OUT.ATTRS = F)
lm_surface$CEU <- predict(loess_model, newdata = lm_surface)
lm_surface <- acast(lm_surface, YRI ~ CHS, value.var = "CEU")
# create plot
plt <- plot_ly(pheno,
x = ~CHS,
y = ~YRI,
z = ~CEU,
type = "scatter3d",
mode = "markers",
marker = list(size = 2),
text = pheno$ID)
plt <- add_trace(plt,
z = lm_surface,
x = axis_x,
y = axis_y,
type = "surface",
colorscale = colourscales[counter]) %>%
layout(title = titles[counter])
return(plt)
})
plts$edu
plts$hei
plts$pig
# list target VCFs
target_vcfs <- list.files(here::here("data"),
pattern = glob2rx("snphits_*.gz"),
full.names = T)
# filter for the three we want
target_vcfs <- target_vcfs[grep("eduyrs|height|pig", target_vcfs)]
# Create raw list of variants
vcf_list_raw <- lapply(target_vcfs, function(vcf_file){
vcf_out <- pegas::read.vcf(vcf_file)
})
# Create vector of populations
populations <- unlist(lapply(rownames(vcf_list_raw[[1]]), function(sample){
meta$Population[meta$Sample == sample]
}))
# Generate Fst stats
fst_out_lst <- lapply(vcf_list_raw, function(pheno){
as.data.frame(pegas::Fst(pheno, pop = populations))
})
# make rownames into separate column
fst_out_lst <- lapply(fst_out_lst, function(pheno){
pheno$snp <- rownames(pheno)
return(pheno)
})
names(fst_out_lst) <- titles
# bind into single DF
fst_out_df <- dplyr::bind_rows(fst_out_lst, .id = "phenotype")
head(fst_out_df)
ggplot(fst_out_df, aes(Fst, fill = phenotype)) +
geom_density(alpha = 0.7) +
labs(fill = "Phenotype") +
ylab("Density") +
theme_bw() +
scale_fill_manual(values = c("#00AFBB", "#E7B800", "#FC4E07"))
# factorise
fst_out_df$phenotype <- factor(fst_out_df$phenotype,
levels = c("Skin/hair pigmentation", "Height", "Educational Attainment"))
ggplot() +
geom_density_ridges2(data = fst_out_df,
mapping = aes(x = Fst, y = phenotype, fill = phenotype),
scale = 2) +
scale_fill_manual(values = c("#FC4E07", "#00AFBB", "#E7B800")) +
ylab(label = NULL) +
theme_bw() +
guides(fill = guide_legend(reverse=T,
title = "Phenotype")) +
scale_y_discrete(expand = expand_scale(add = c(0.2, 2.3)))
## Picking joint bandwidth of 0.0225
# get samples from target popns only
target_popns <- which(populations %in% c("YRI", "CEU", "CHS"))
populations_3pop <- populations[target_popns]
vcf_list_raw_3pop <- lapply(vcf_list_raw, function(pheno){
pheno[target_popns, ]
})
# Generate Fst stats
fst_out_lst_3pop <- lapply(vcf_list_raw_3pop, function(pheno){
as.data.frame(pegas::Fst(pheno, pop = populations_3pop))
})
# make rownames into separate column
fst_out_lst_3pop <- lapply(fst_out_lst_3pop, function(pheno){
pheno$snp <- rownames(pheno)
return(pheno)
})
names(fst_out_lst_3pop) <- titles
# bind into single DF
fst_out_df_3pop <- dplyr::bind_rows(fst_out_lst_3pop, .id = "phenotype")
head(fst_out_df_3pop)
## phenotype Fit Fst Fis snp
## rs780569 Educational Attainment 1 0.10356157 1 rs780569
## rs34394051 Educational Attainment 1 0.02428879 1 rs34394051
## rs11121177 Educational Attainment 1 0.14444609 1 rs11121177
## rs4846010 Educational Attainment 1 0.09716410 1 rs4846010
## rs78116078 Educational Attainment 1 0.17017201 1 rs78116078
## rs10799615 Educational Attainment 1 0.17098100 1 rs10799615
ggplot(fst_out_df_3pop, aes(Fst, fill = phenotype)) +
geom_density(alpha = 0.7) +
labs(fill = "Phenotype") +
ylab("Density") +
theme_bw() +
scale_fill_manual(values = c("#00AFBB", "#E7B800", "#FC4E07"))
# factorise
fst_out_df_3pop$phenotype <- factor(fst_out_df_3pop$phenotype,
levels = c("Skin/hair pigmentation", "Height", "Educational Attainment"))
ggplot() +
geom_density_ridges2(data = fst_out_df_3pop,
mapping = aes(x = Fst, y = phenotype, fill = phenotype),
scale = 2) +
scale_fill_manual(values = c("#FC4E07", "#00AFBB", "#E7B800")) +
ylab(label = NULL) +
theme_bw() +
guides(fill = guide_legend(reverse=T,
title = "Phenotype")) +
scale_y_discrete(expand = expand_scale(add = c(0.2, 2.3)))
## Picking joint bandwidth of 0.0397